rm(list=ls())
setwd("E:/data/cooperation/government/final_data")##set the path of the data
##load the library
library(foreign)
library(xlsxjars)
library(xlsx)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(readstata13)
library(randomForest)
##load the data----
da<-read.dta13("govern.dta")
lab<-labels(da)[[2]]
##select the main variables based on theories of innovation and government
va<-c("monetaryfreedom","al_ethnic","wdi_trade","governmentintegrity","lgdppc","p_durable","vdem_corr","kun_wiqreco_all","fe_cultdiv","lrd_pc",
      "urbaniz","lpop_dnst","laborfreedom","wbgi_pse","fincialfreedom","wbgi_gee","fh_fotpsc",
      "wdi_internetuse","taxburden","wdi_telephone","pat_uspt","rsf_pfi","wdi_internetserv",
      "ciri_polpris","fh_ipolity2","hf_efiscore","fh_rol","une_pee","al_language")
length(va)

da1<-da[,match(va,labels(da)[[2]])]
da$pat_uspt=log(da$pat_uspt)
da$wdi_internetuse=log(da$wdi_internetuse)
var<-c(va,"year","countrycode")# add the variables of year and countrycode
num<-match(var,lab) 
da<-da[,num]
write.dta(da,file="original1.dta")
##machine learning----
result<-randomForest(wbgi_gee ~.,data=da1,importance=TRUE, na.action=na.omit,ntree=1000)
im<-importance(result)##Table 1A: The Importance Degree of Variables from Random Forest
write.csv(im,file="importance2.csv")
#drop the less important variables
va<-c("wdi_trade","lgdppc","vdem_corr","governmentintegrity","fe_cultdiv",
      "lpop_dnst","wbgi_pse","wbgi_gee","taxburden","wdi_telephone","pat_uspt","wdi_internetserv",
      "ciri_polpris","fh_ipolity2","hf_efiscore","rq","wdi_internetuse")

var<-c(va,"year","countrycode")# add the variables of year and countrycode
num<-match(var,lab) 
da<-da[,num]
write.dta(da,file="original.dta")##save the original data after machine learning

##analyse the balance of data 
da2<-split(da,da$countrycode)  #split the data based on countrycode 
country<-labels(da2)#countrycode names
re<-colSums(is.na(da2[[1]]))/dim(da2[[1]])[1]
for (i in 2:length(country))
{
  re1<-colSums(is.na(da2[[i]]))/dim(da2[[i]])[1]
  re<-cbind(re,re1)
}
colnames(re)<-country
write.csv(re,file="balance_country.csv")#summary the balance of the data based on countrycode
re_country<-re
ba_country<-colMeans(re_country)
ba_year1<-rowMeans(re_country)

##split the data based on year
da2<-split(da,da$year)  
year<-labels(da2)#year names
re<-colSums(is.na(da2[[1]]))/dim(da2[[1]])[1]
for (i in 2:length(year))
{
  re1<-colSums(is.na(da2[[i]]))/dim(da2[[i]])[1]
  re<-cbind(re,re1)
}
colnames(re)<-year
write.csv(re,file="balance_year.csv")#summary the balance of the data based on year
re_year<-re
ba_year<-colMeans(re_year)
ba_year1<-rowMeans(re_year)

##drop the data of the countrycode which is missed so much 
sum<-summary(ba_country)
length(ba_country)
na<-names(which(ba_country<sum[5]))##quantile of third
m=0
for (i in 1:length(na))
{
  n<-which(da$countrycode==na[i])
  m<-c(m,n)
}
m<-m[-1]
da<-da[m,]
write.dta(da,file="final_3rd.dta") #get the main database
##table 1
summary(ba_country)
summary(ba_year)
sd(ba_country)
sd(ba_year)
##fig 1----
data<-as.data.frame(ba_country)
ggplot(data,aes(x=ba_country))+
  geom_histogram(aes(y=..density..),bins = 100,fill="white",color="black")+
  stat_density(geom='line',position='identity',size=1,color="red")+
  geom_vline(xintercept=0.1743421,col="blue")+
  theme_bw()+
  labs(x="The proportion of missing data",y="Density",size=0.1)+
  theme(title=element_text(size=8))+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
##save the countries which are selected
country<-names(table(da$countrycode))
country<-matrix(country,ncol=9)
length(country)
write.csv(country,file="country.csv")

##fig 2----
data<-read.dta("final_3rd.dta")
lpat_uspt<-log(data$pat_uspt)
data<-data.frame(data,lpat_uspt)
ggplot(data,aes(x=lpat_uspt))+
  geom_point(aes(y=wbgi_gee,color=year),size=1)+
  geom_smooth(aes(y=wbgi_gee),method="loess",color=alpha("red",0.5),size=1)+
  scale_colour_gradient(low = "grey100",high = "black")+
  theme_bw()+
  theme(title=element_text(size=10))+
  labs(y="Government ffectiveness",x="The log of patent grants")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5),axis.line = element_line())

##add the region area----
da<-read.dta13("final_3rd.dta")
da1<-read.dta13("govern.dta")
da2<-data.frame(da1$year,da1$countrycode,da1$regioncode)
colnames(da2)<-c("year","countrycode","regioncode")
da1<-merge(da,da2,by=c("year","countrycode"))
write.dta(da1,file="final_3rd.dta")

##split the level of the democracy----
country<-names(table(da$countrycode))
FirstQu_polity<-vector(length=length(country))
Median_polity<-vector(length=length(country))
Mean_polity<-vector(length=length(country))
ThirdQu_polity<-vector(length=length(country))
group<-vector(length=length(country))
da_polity<-data.frame(country,FirstQu_polity,Median_polity,Mean_polity,ThirdQu_polity)
for (i in 1:length(country))
{
  d<-summary(da[which(da$countrycode==da_polity[i,1]),14])
  da_polity[i,2]<-d[[2]]
  da_polity[i,3]<-d[[3]]
  da_polity[i,4]<-d[[4]]
  da_polity[i,5]<-d[[5]]
}
me<-median(da_polity$Mean_polity)
da_polity<-data.frame(da_polity,group=group)
for (i in 1:length(country))
{
  da_polity$group<-as.numeric(da_polity$Mean_polity>=me)
}
highcountry<-da_polity[da_polity$group==1,1]
lowcountry<-da_polity[da_polity$group==0,1]
##fig 3----
ggplot(da_polity,aes(Mean_polity))+
  geom_histogram(aes(y=..density..),bins = 30,fill="white",color="black")+
  stat_density(geom='line',position='identity',size=1,color="grey20")+
  theme_bw()+
  labs(x="level of democracy for different countries")+
  theme(title=element_text(size=10))+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
##get the level of the democracy into variable (group)
group<-vector(length=dim(da)[1])
da<-data.frame(da,group)
highcountry<-as.character(highcountry)
lowcountry<-as.character(lowcountry)
write.csv(highcountry,file="high.csv")
write.csv(lowcountry,file="low.csv")
for (i in 1:length(highcountry))
{
  da$group[which(da$countrycode==highcountry[i])]<-1
}

for (i in 1:length(lowcountry))
{
  da$group[which(da$countrycode==lowcountry[i])]<-0
}
write.dta(da,file="final_3rd_demo.dta")

##figure statistical -----
da<-read.dta("final_3rd.dta")
q1<-ggplot(da,aes(wbgi_gee))+
  geom_histogram(aes(y=..density..),bins = 50,fill="white",color="black")+
  stat_density(geom='line',position='identity',size=1,color="red")+
  theme_bw()+
  labs(title="government",x=NULL)+
  theme(title=element_text(size=10))+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
q2<-ggplot(da,aes(lgdppc))+
  geom_histogram(aes(y=..density..),bins = 50,fill="white",color="black")+
  stat_density(geom='line',position='identity',size=1,color="red")+
  theme_bw()+
  labs(title="GDP",x=NULL)+
  theme(title=element_text(size=10))+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
q3<-ggplot(da,aes(vdem_corr))+
  geom_histogram(aes(y=..density..),bins = 50,fill="white",color="black")+
  stat_density(geom='line',position='identity',size=1,color="red")+
  theme_bw()+
  labs(title="Coruption",x=NULL)+
  theme(title=element_text(size=10))+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
q4<-ggplot(da,aes(wbgi_pse))+
  geom_histogram(aes(y=..density..),bins = 50,fill="white",color="black")+
  stat_density(geom='line',position='identity',size=1,color="red")+
  theme_bw()+
  labs(title="stability",x=NULL)+
  theme(title=element_text(size=10))+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
q5<-ggplot(da,aes(pat_uspt))+
  geom_histogram(aes(y=..density..),bins = 50,fill="white",color="black")+
  stat_density(geom='line',position='identity',size=1,color="red")+
  theme_bw()+
  labs(title="patent",x=NULL)+
  theme(title=element_text(size=10))+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
q6<-ggplot(da,aes(fh_ipolity2))+
  geom_histogram(aes(y=..density..),bins = 50,fill="white",color="black")+
  stat_density(geom='line',position='identity',size=1,color="red")+
  theme_bw()+
  labs(title="level of democracy for different countries",x=NULL)+
  theme(title=element_text(size=10))+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
grid.arrange(q1,q2,q3,q4,q5,q6,ncol=3)

##check the number of the 
da<-read.dta13("final_3rd_demo.dta")
da1<-split(da,da$group)
da_0<-da1[[1]]
da_1<-da1[[2]]
length(names(table(da_0$countrycode)))
length(names(table(da_1$countrycode)))

##add the 


